import re
import random
import math


class Ngram_Language_Model:
    """A Markov (n-gram) language model learned from a given text.

    Supports language generation and the evaluation (log-likelihood) of a
    given string. The model can be applied on both word level and character
    level.
    """

    def __init__(self, n=3, chars=False, log_base=math.e, stupid_backoff_alpha=0.3):
        """Initializing a language model object.

        NOTE:
        self.n_grams_by_len is a list containing self.n dictionaries, one for each size of n_gram
        (from 1 to self.n). Read the note in build_model to understand why all these different
        sized n_grams are collected.

        Args:
            n (int): the length of the markov unit (the n of the n-gram). Defaults to 3.
            chars (bool): True iff the model consists of ngrams of characters rather than word tokens.
                          Defaults to False.
            log_base (float): base of the logarithm used by evaluate(). Defaults to math.e.
            stupid_backoff_alpha (float): discount applied on each backoff step in
                          stupid_backoff(). Defaults to 0.3.
        """
        self.n = n
        self.chars = chars
        self.log_base = log_base
        self.n_grams_by_len = []  # filled by build_model(): one {ngram: count} dict per size 1..n
        self.corpus_len = 0  # number of training tokens (including artificial padding tokens)
        self.stupid_backoff_alpha = stupid_backoff_alpha

    def join(self, tokens):
        """Join the tokens into a single string, according to the mode (chars or not).

        :param tokens: tokens to join
        :return: joined string (no separator in char mode, a space in word mode)
        """
        joiner = '' if self.chars else ' '
        return joiner.join(tokens)

    def split(self, string):
        """Split the string into tokens, according to the mode (chars or not).

        :param string: string to split
        :return: list of tokens
        """
        if self.chars:
            return list(string)
        return string.split(' ')

    def build_model(self, text):
        """Populates dictionaries counting all ngrams (sizes 1 to self.n) in the specified text.

            Note:
            All the n_grams from size 1 to self.n are collected, in order to be able to
            generate/evaluate more accurately texts that have contexts of size self.n - 1 that
            don't exist in the model BUT where one of the inner contexts contained in them
            (size self.n-2, ..., 1) DID exist in the training corpora.

            Args:
                text (str): the text to construct the model from.

            Returns:
                dict. The {ngram: count} mapping for ngrams of size self.n.
        """
        # Pad with n-1 '<' start markers (at the beginning and after every sentence-ending
        # ' . ') and a single '>' end marker, so every real token has a full context.
        text = '< ' * (self.n - 1) + text.replace(' . ', ' .%s ' % (' <' * (self.n - 1))) + ' >'
        tokens = self.split(text)
        self.corpus_len = len(tokens)
        self.n_grams_by_len = [{} for _ in range(self.n)]
        for i in range(len(tokens)):  # for each end position in tokens
            for n in range(self.n):  # for each n-gram size from 1 to self.n
                if i >= n:  # only once the index has advanced enough for this size
                    n_gram = self.join(tokens[i - n: i + 1])
                    n_grams = self.n_grams_by_len[n]  # counts dict for this n-gram size
                    n_grams[n_gram] = n_grams.get(n_gram, 0) + 1
        return self.get_model()

    def get_model(self):
        """Returns the model as a dictionary of the form {ngram: count}.
        """
        return self.n_grams_by_len[-1]  # the last dictionary holds ngrams of size self.n

    def generate(self, context=None, n=20):
        """Returns a string of the specified length, generated by applying the language model
        to the specified seed context. If no context is specified the context is sampled
        from the model's contexts distribution. Generation stops before the n'th word if the
        contexts are exhausted.

        NOTE:
        Since all the n_grams from size 1 to self.n were collected, text generation can
        continue even if the contexts of size self.n - 1 have been exhausted, by backing
        off to shorter contexts. That's what this function does.

            Args:
                context (str): a seed context to start the generated string from. Defaults to None.
                n (int): the length of the string to be generated.

            Return:
                String. The generated text.
        """
        if context is None:
            context = ''
            final_tokens = []
        else:
            final_tokens = self.split(context)
        # Prepend the artificial '<' prefix so the first real token has a context.
        context = ' '.join(['<'] * (self.n - 1) + [context.replace(' . ', ' .%s ' % (' <' * (self.n - 1)))])
        tokens = self.split(context)

        while len(final_tokens) < n:

            # find the longest context that exists in the model, from size self.n - 1 down to 1
            # (for a unigram model there is no context at all)
            history = tokens[-(self.n - 1):] if self.n > 1 else []
            while len(history) > 0:
                histories = self.n_grams_by_len[len(history) - 1]
                if self.join(history) in histories:
                    break
                else:  # back off: try a shorter history
                    history = history[1:]

            # collect the candidate next tokens with their respective weights (counts)
            candidates, weights = [], []
            len_history = len(history)
            history = self.join(history)
            for n_gram, n_gram_count in self.n_grams_by_len[len_history].items():
                n_gram_tokens = self.split(n_gram)
                n_gram_history = self.join(n_gram_tokens[:-1])
                if n_gram_history == history:
                    candidates.append(n_gram_tokens[-1])
                    weights.append(n_gram_count)

            # sample a candidate proportionally to its count and append it to the text
            selected_candidate = random.choices(candidates, weights=weights)[0]
            if selected_candidate == '>':  # text ending marker
                break
            tokens.append(selected_candidate)
            if selected_candidate != '<':  # never emit the artificial padding marker
                final_tokens.append(selected_candidate)

        return self.join(final_tokens)

    def evaluate(self, text):
        """Returns the log-likelihood of the specified text to be generated by the model.
           Smoothing is applied if necessary.

            NOTE:
            Smoothing (stupid backoff) is always applied, because otherwise it is almost
            impossible to compare the likelihoods of different sentences if for one sentence
            it's smoothed and for the other one it isn't (assuming that you don't really
            know which one was smoothed).

           Args:
               text (str): Text to evaluate.

           Returns:
               Float. The float should reflect the (log) probability.
        """
        text = ' '.join(['<'] * (self.n - 1) + [text.replace(' . ', ' .%s ' % (' <' * (self.n - 1)))])
        tokens = self.split(text)
        total = 0.0  # named 'total' rather than 'sum' to avoid shadowing the builtin
        for i in range(len(tokens) - self.n + 1):
            n_gram = self.join(tokens[i: i + self.n])
            prob = self.get_probability(n_gram, True)
            total += math.log(prob, self.log_base)
        return total

    def get_probability(self, n_gram, smoothing=False):
        """Calculate the probability of an ngram in the language model, which is the frequency
        of the n_gram divided by the frequency of its history (context).

        :param n_gram: the ngram to calculate the probability of
        :param smoothing: if True, apply smoothing (stupid backoff)
        :return: float from 0 to 1
        """
        if smoothing:
            return self.stupid_backoff(self.split(n_gram), self.stupid_backoff_alpha)
        history = self.join(self.split(n_gram)[:-1])
        if self.n > 1:
            history_count = self.n_grams_by_len[-2].get(history, 0)
        else:
            history_count = self.corpus_len  # unigram model: the history is empty
        if history_count == 0:
            return 0.0  # unseen history: the MLE probability is 0 (avoids ZeroDivisionError)
        n_gram_count = self.n_grams_by_len[-1].get(n_gram, 0)
        return n_gram_count / history_count

    def smooth(self, ngram):
        """Returns the smoothed (Laplace) probability of the specified ngram.

            Args:
                ngram (str): the ngram to have its probability smoothed

            Returns:
                float. The smoothed probability.
        """
        history = self.join(self.split(ngram)[:-1])
        if self.n > 1:
            histories = self.n_grams_by_len[-2]
            history_count = histories.get(history, 0)
            if not history_count:  # save time by not looking for the n_gram itself
                return 1 / len(histories)
        else:
            history_count = self.corpus_len  # unigram model: the history is empty
        n_gram_count = self.n_grams_by_len[-1].get(ngram, 0)
        vocab_len = len(self.n_grams_by_len[0])
        return (n_gram_count + 1) / (history_count + vocab_len)

    def stupid_backoff(self, tokens, alpha):
        """Returns the stupid-backoff score of the given token sequence: the relative
        frequency of the full sequence if it was seen in training, otherwise alpha times
        the score of the sequence without its first token (recursively).

        :param tokens: list of tokens forming the ngram to score
        :param alpha: multiplicative discount applied on each backoff step
        :return: float score (not a true probability; scores don't sum to 1)
        """
        if not tokens:  # even the single word doesn't exist in the model
            return alpha / self.corpus_len
        ngram = self.join(tokens)
        ngrams = self.n_grams_by_len[len(tokens) - 1]
        ngram_count = ngrams.get(ngram, 0)
        if not ngram_count:
            return alpha * self.stupid_backoff(tokens[1:], alpha)
        if len(tokens) > 1:
            # the (len-1)-sized prefix of a counted ngram is always counted too,
            # since build_model collects every n-gram size at every position
            history = self.join(tokens[:-1])
            histories = self.n_grams_by_len[len(tokens) - 2]
            history_count = histories[history]
        else:
            history_count = self.corpus_len
        return ngram_count / history_count


def normalize_text(text, lower=True, punctuations=',!?:;', chars_to_remove=r'\(\)\[\]\{\}\<\>\#*"-',
                   char_to_make_whitespace='/'):
    """Returns a normalized string based on the specified string.

       Explanation:
       The sentences are not separated, as the model is expected to naturally start and end
       sentences by using the characters '.', ',', etc. The normalization operations:
       1) make lowercase, 2) remove unnecessary characters, 3) add a space before punctuation
       characters, 4) replace some chars with whitespace, 5) remove redundant whitespaces,
       6) treat points, to not end a sentence with false sentence-ending points
       (e.g., U.S.A., 3.1416) or ellipsis (...).

       Args:
           text (str): the text to normalize
           lower: if true then make all letters lowercase
           punctuations: characters to consider as punctuations (and add a space before them)
           chars_to_remove: characters to remove from text (regex-escaped character class body)
           char_to_make_whitespace: characters to convert to whitespace

       Returns:
           string. the normalized text.

    """
    if lower:
        text = text.lower()
    text = text.strip()  # remove trailing spaces
    text = re.sub(r'([' + chars_to_remove + '])', '', text)  # remove characters
    text = re.sub('([' + punctuations + '])', r' \1', text)  # add space before punctuations
    text = re.sub('([' + char_to_make_whitespace + '])', ' ', text)  # replace with space
    text = re.sub(r'\s+', ' ', text)  # remove redundant spaces

    # treat points especially, for the model to be able to split sentences:
    text = re.sub(r'(\. )', r' \1', text)  # add space only before points not part of abbreviations (e.g. U.S.A.)
    text = re.sub(r'\.\. \.', r' ...', text)  # join the ruined ellipsis ('...')
    # endswith is safe on the empty string, unlike the previous text[-1] indexing
    if text.endswith('.'):
        text = text[:-1] + ' .'
    return text


def who_am_i():
    """Returns a dictionary with the author's name, id number and email.
       keys = ['name', 'id', 'email'].
    """
    details = dict(name='Jonathan Martinez',
                   id='201095569',
                   email='martijon@post.bgu.ac.il')
    return details
